Reproducible analysis of HGDP

library(knitr)
library(genio)
library(ggtree)
library(popkin)
library(BEDMatrix)
library(superadmixture)
library(tidyverse)

# for plotting
library(ggplot2)
library(grid)
library(gridGraphics)
library(ggplotify)
library(ggpubr)
library(patchwork)
library(latex2exp)

1 Data Preprocessing

In this document, we demonstrate our procedures for analysis of Human Genome Diversity Project (HGDP) - CEPH Panel. The raw HGDP is available at the webpage, attributed to Bergström et al. (2020). Here we start with the assembled version of HGDP, provided by the PLINK. PLINK hosts the assembled data at the following dropbox link:

The associated annotations can be found at the FTP site: ftp://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/metadata/hgdp_wgs.20190516.metadata.txt

We use PLINK2 and zstd for data preprocessing. The PLINK2 software can be downloaded from this webpage. The ztsd is hosted at the github, and can downloaded and installed through the following commands.

git clone https://github.com/facebook/zstd.git
cd zstd
make install

We download and unzip the assembled HGDP data from the dropbox stated above. We create a folder called rawdata to host the downloaded raw data and pre-processed data.

mkdir -p rawdata && cd $_

zstd=<PATH_TO_ZSTD_EXECUTABLE>

## download `.pgen` file
wget "https://www.dropbox.com/s/hppj1g1gzygcocq/hgdp_all.pgen.zst?dl=1"
mv "hgdp_all.pgen.zst?dl=1" "hgdp_all.pgen.zst"
./$zstd -d "hgdp_all.pgen.zst"
rm "hgdp_all.pgen.zst"

## download `.pvar` file
wget "https://www.dropbox.com/s/1mmkq0bd9ax8rng/hgdp_all.pvar.zst?dl=1"
mv "hgdp_all.pvar.zst?dl=1" "hgdp_all.pvar.zst"

## download `.psam` file
wget "https://www.dropbox.com/s/0zg57558fqpj3w1/hgdp.psam?dl=1"
mv "hgdp.psam?dl=1" "hgdp_all.psam"

## download metadata
 wget "ftp://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/metadata/hgdp_wgs.20190516.metadata.txt"

This step serves set missing IDs to unique values to avoid these being detected as repeated IDs

PLINK2=<PATH_TO_PLINK2_EXECUTABLE>

./$PLINK2 --pfile hgdp_all vzs \
          --set-missing-var-ids '@:#' \
          --make-just-pvar zs \
          --out hgdp_all_uniq
          
# replace data
mv hgdp_all_uniq.pvar.zst hgdp_all.pvar.zst

# trash
rm hgdp_all_uniq.log

This step serves to convert files from zvs format to PLINK binary format.

./$PLINK2 \
    --pfile hgdp_all vzs \
    --var-filter \
    --snps-only just-acgt \
    --max-alleles 2 \
    --make-bed \
    --out hgdp_wgs_autosomes

Then we add subpopulation labels to FAM files.

awk -F" " '{
  if (NR == FNR) {
    population[$1]=$6;
    if ($10 == "M") {
      gender[$1]=1;
    } else if ($10 == "F") {
      gender[$1]=2;
    } else {
      gender[$1]=0;
    }
  } 
  else print population[$2],$2,$3,$4,gender[$2],$6;
}' hgdp_wgs.20190516.metadata.txt hgdp_wgs_autosomes.fam > hgdp_wgs_autosomes.fam.NEW
mv hgdp_wgs_autosomes.fam.NEW hgdp_wgs_autosomes.fam

This step serves to preserve loci that:

  1. have MAF >= 0.01
  2. are in approximate linkage equilibrium with each other
./$PLINK2 \
    --bfile hgdp_wgs_autosomes \
    --make-bed \
    --out hgdp_wgs_autosomes_maf_0.01 \
    --maf 0.01
    
## this command determines the loci to keep or exclude
./$PLINK2 --bfile hgdp_wgs_autosomes_maf_0.01 --indep-pairwise 1000kb 0.3 --out hgdp_wgs_autosomes_maf_0.01
  
## this actually filters the data
./$PLINK2 \
    --bfile hgdp_wgs_autosomes_maf_0.01 \
    --extract hgdp_wgs_autosomes_maf_0.01.prune.in \
    --make-bed \
    --out hgdp_wgs_autosomes_final

rm hgdp_wgs_autosomes.{bed,bim,fam,log}
rm hgdp_wgs_autosomes_maf_0.01.{bed,bim,fam,log,prune.in,prune.out}
rm hgdp_wgs_autosomes_final.log

2 Estimating individual-level coancestry

In the following sections, the intermediate data will be stored at the rdata folder.

X    <- BEDMatrix::BEDMatrix("rawdata/hgdp_wgs_autosomes_final", simple_names = TRUE)
fam  <- read_fam( "rawdata/hgdp_wgs_autosomes_final")
info <- read_tsv( "rawdata/hgdp_wgs.20190516.metadata.txt", col_types = 'ccccccddccddddd') %>%
  dplyr::select(c('sample', 'population', 'latitude', 'longitude', 'region'))

Here we demonstrate how to estimate the individual-level coancestry \(\boldsymbol{\Theta}\) according to the Ochoa-Storey (OS) method by popkin package. So here we only presents the codes but not running them. We provide the pre-computed coancestry at rdata/coanc_os.rda. It should noted that the popkin function returns the kinship coefficients instead of the coancestry coefficients. Therefore, we use the inbr_diag function in the popkin package to map kinship coefficients \(\phi_{jk}\)’s to coancestry coefficients \(\theta_{jk}\)’s:

\[ \theta_{jk} = \begin{cases} 2\phi_{jk} - 1 & j = k \\ \phi_{jk} &j \neq k \end{cases}. \]

fam <- dplyr::left_join(fam, info, by = c("id" = "sample")) %>%
       dplyr::rename("subpop" = "region") %>%
       dplyr::mutate(subpop = ifelse(subpop == "AFRICA", "Africa",
                              ifelse(subpop == "MIDDLE_EAST", "MiddleEast",
                              ifelse(subpop == "EUROPE", "Europe",
                              ifelse(subpop == "CENTRAL_SOUTH_ASIA", "CSAsia",
                              ifelse(subpop == "EAST_ASIA", "EAsia",
                              ifelse(subpop == "AMERICA", "Americas", "Oceania")))))))


# reorder by subpop
subpop_order <- c('Africa', 'MiddleEast', 'Europe', 'CSAsia', 'EAsia', 'Americas', 'Oceania')
index <- order( match(fam$subpop, subpop_order) )
fam   <- fam[index, ]
X     <- X[index, ]

# Estimate kinship
obj         <- popkin::popkin_A(t(X))
A_min       <- popkin::popkin_A_min_subpops(obj$A, subpops =  fam$fam)
kinship_os  <- 1 - obj$A / A_min    
coanc_os    <- popkin::inbr_diag(kinship_os)
coanc_os  <- ifelse(coanc_os < 0, 0, coanc_os)
save(coanc_os,   file = "rdata/coanc_os.rda")
save(fam,        file = "rdata/fam.rda")
save(X,          file = "rdata/X.rda")

We can visualize the individual-level coancestry of the simulated data using plot_popkin function in the popkin function. We use the following helper function plot_colors_subpops to label the sub-populations.

plot_colors_subpops <- function(pops, srt = 0, cex = 0.6, y = FALSE) {
  n <- length(pops)
  k <- unique(pops)
  pops <- factor(pops, levels = unique(pops))
  xintercept <- cumsum(table(pops))
  breaks <- xintercept - 0.5 * as.numeric(table(pops))
  if (y) {
    plot(NULL, xlim = c(0, 1), ylim = c(1, n), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
    text(1, n-rev(breaks), rev(unique(pops)), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
  } else {
    plot(NULL, xlim = c(1, n), ylim = c(0, 1), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
    text(breaks, 1, unique(pops), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
  }
}
load("rdata/coanc_os.rda")
load("rdata/fam.rda")

par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(c(3, 1, 2), c(3, 1, 5), c(0, 4, 0)), widths = c(0.2, 1, 0.2), heights = c(0.5, 0.5, 0.3))

popkin::plot_popkin(kinship = coanc_os, layout_add = FALSE, leg_cex = 0.8, labs_text = FALSE, labs_lwd = 0.1, labs = fam$subpop, ylab = '', leg_title = "Coancestry")

par(mar = c(0.2, 0, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 0.8)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0, 0.2, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 0.8)

3 Estimating admixture proportions

The following chunk of the codes estimates the admixture proportions \(\boldsymbol{Q}\) from genotypes. We first estimate the individual specific allele frequencies \(\boldsymbol{\Pi}\) using the est_p_indiv function in the superadmixture package. We then estimate \(\boldsymbol{Q}\) by decomposing \(\boldsymbol{\Pi}\) with factor_p_indiv function in the superadmixture package. It takes hours to run the following codes, so we’re not running here.

for (k_antepops in 2:15) {
    # estimate individual-specific allele frequencies
    obj <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE)
    p_indiv <- obj$p_indiv
    rowspace <- obj$rowspace
    
    # estimate admix_props by decomposing individual-specific allele frequencies
    obj <- factor_p_indiv(p_indiv = p_indiv,
                          rowspace = rowspace,
                          k_antepops = k_antepops,
                          init_method = "rowspace",
                          verbose = TRUE,
                          max_iters = 1000,
                          tol = 1e-4)
    
    # save admixture proportions 
    Q_hat <- obj$Q_hat
    dir.create(file.path("rdata", "Q_hat"), showWarnings = FALSE)
    save(Q_hat, file = paste0("rdata/Q_hat/", k_antepops, ".rda"))
}

4 Selecting number of antecedent populations

The following chunk of the codes calculates the p-values of structured Hardy-Weinberg Equilibrium test (sHWE) and plots the relationship between the negative entropy of the sHWE p-values and \(K\). It takes hours to run the following codes, so we’re not running here.

for (k_antepops in 2:15) {
    # estimate `rowspace`, an input for sHWE function
   rowspace <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE, rowspace_only = TRUE)
   
   # perform sHWE test
   pvals <- sHWE(X, k_antepops, loci_on_cols = TRUE, rowspace = rowspace) 
   save(pvals, file = paste0("rdata/sHWE/", k_antepops, ".rda"))
}

neg_entropy <- c()
for (k_antepops in 2:15) {
    load(paste0("rdata/sHWE/", k_antepops, ".rda"))
    neg_entropy <- c(neg_entropy, calc_neg_entropy(pvals))
}
save(neg_entropy, file = "rdata/neg_entropy.rda")

We visualize the relationship between \(K\) and the negative entropy of sHWE p-values.

load("rdata/neg_entropy.rda")
plot(2:15, neg_entropy, ylim = c(-2.2, -1.85), xlab = "", ylab = "", type = "b", family="Times New Roman")
mtext(latex2exp::TeX(r"(\textit{$K$})"), side = 1, col = "black", line = 2.5, family="Times New Roman", pch = 10)
mtext("Negative Entropy", side = 2, line = 2.5, col = "black",  family="Times New Roman")

We select \(K = 7\) according to this plot.

5 Estimating the coancestry among antecedent populations

After obtaining individual-level coancestry \(\boldsymbol{\Theta}\) and admixture proportions \(\boldsymbol{Q}\), we can use the function est_coanc to estimate population coancestry under the super admixture and standard admixture.

load("rdata/coanc_os.rda")
load("rdata/Q_hat/7.rda")

coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")

We then compute the corresponding individual-level coancestry under the super admixture and standard admixture.

coanc_sup <- t(Q_hat) %*% coanc_antepops_sup %*% Q_hat
coanc_std <- t(Q_hat) %*% coanc_antepops_std %*% Q_hat

6 Visualization

We can visualize the coancestry of antecedent populations coanc_antepops_sup and admixture proportions Q_hat using the helper functions get_seq_colors, plot_tree, barplot_admix and heatmap_coanc_antepops we provide.

# reorder antecedent populations according to the value of the population inbredding
index        <- order(diag(coanc_antepops_sup))
Q_hat        <- Q_hat[index, ]
k_antepops   <- 7
coanc_antepops_sup <- coanc_antepops_sup[index, index]

# label antecedent populations
colnames(coanc_antepops_sup) <- rownames(coanc_antepops_sup) <- paste0("S", 1:k_antepops)

# fit tree
tree <- bnpsd::fit_tree(round(coanc_antepops_sup, 6))

# plot an uncolorred tree
plot_tree(tree)

Based on the topology of the tree presented in the previous code chunk, we decide to color the populations \(S_1\), \(S_2\) by light blue and dark blue, \(S_3\), \(S_4\) by light green and dark green, \(S_5\) and \(S_6\) by light red and dark red and \(S_7\) by purple.

colors <- c(get_seq_colors("Blues", 2), get_seq_colors("Greens", 2), get_seq_colors("Reds", 2), get_seq_colors("Purples", 1))
names(colors) <- paste0("S", 1:7)

fig_tree <- plot_tree(tree, colors = colors, font_size = 15)

# flip tree 
fig_tree <- ggtree::flip(fig_tree, 
                         get_current_node("S7", tree), 
                         get_parent_node("S5", tree))

fig_tree

We can visualize admixture proportions Q_hat using the barplot_admix function.

fig_admix <- barplot_admix(Q_hat, colors = colors, subpops = fam$subpop)
fig_admix 

We also can visualize the coancestry among antecedent populations by heatmap_coanc_antepops. We define the helper function heatmap_coanc_antepops_wrapper that returns a ggplot object of the heatmap.

heatmap_coanc_antepops_wrapper <- function(coanc_antepops, tl.cex = 1.3, cl.cex = 1, tl.offset = 0.6) {
  superadmixture::heatmap_coanc_antepops(coanc_antepops, tl.cex = tl.cex, cl.cex = cl.cex, tl.offset = tl.offset)
  grab_grob <- function(){
    gridGraphics::grid.echo()
    grid::grid.grab()
  }
  p <- grab_grob()
 
 # save correlation matrix colors to a vector, then make coloured matrix grob transparent
 matrix.colors <- grid::getGrob(p, grid::gPath("square"), grep = TRUE)[["gp"]][["fill"]]
 p <- grid::editGrob(p, grid::gPath("square"), grep = TRUE, gp = grid::gpar(col = NA, fill = NA))

 # apply the saved colours to the underlying matrix grob
 p <- grid::editGrob(p, grid::gPath("symbols-rect-1"), grep = TRUE, gp = grid::gpar(fill = matrix.colors))

 # convert the background fill from white to transparent, while we are at it
 p <- grid::editGrob(p, grid::gPath("background"), grep = TRUE, gp = grid::gpar(fill = NA))
 p <- ggplotify::as.ggplot(p) + ggplot2::theme(plot.margin = unit(c(0, 0, 0, 0), "pt"))
 return(p)
}
par(xpd = TRUE)
fig_coancestry <- heatmap_coanc_antepops_wrapper(coanc_antepops_sup, tl.cex = 1.2, tl.offset = 1)
fig_coancestry <- fig_coancestry + theme(plot.margin = margin(0, 0, 0, 0, "pt"))
coancestry_lab <- ggplot() + 
  annotate("text", x = 0.5, y = 0.5, size = 4.5, label = "Coancestry") + 
  xlim(0, 1) + 
  ylim(0, 1) + 
  theme_void()
fig_coancestry <- ggarrange(fig_coancestry, coancestry_lab, ncol = 1, heights = c(1, 0.1))
fig_coancestry

We combine all plots.

# combine plots of tree, admix props, subpops and coancestry
design <- "
  112
  113
  ##3
"

fig_coancestry + fig_tree + fig_admix + 
  patchwork::plot_layout(
            design = design,
            widths = c(0.7, 0.2, 1.4), 
            heights = c(0.7, 0.2, 0.4)) +
  patchwork::plot_annotation(tag_levels = 'A', tag_prefix = '(', tag_suffix = ')') &
  ggplot2::theme(plot.tag = ggplot2::element_text(color = "black", size = 21, face = 'bold'))

7 Simulating genotypes from the structure of HGDP

We can simulate genotypes using the double-admixture method with the dbl_admixture function. It takes about an hour to run the following codes, so we’re not running here.

X <- as.matrix(X)
p_anc <- 0.5 * colMeans(X, na.rm = TRUE)
coanc_antepops <- round(coanc_antepops, 6)
X_sim <- dbl_admixture(p_anc, coanc_antepops, Q_hat, geno_only = TRUE)

## estimate kinship
obj         <- popkin::popkin_A(X_sim)
A_min   <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os  <- 1 - obj$A / A_min

## mapping kinship coefficients to coancestry coefficients
coanc_sim_os  <- popkin::inbr_diag(kinship_sim_os)
coanc_sim_os  <- ifelse(coanc_sim_os < 0, 0, coanc_sim_os)
save(coanc_sim_os,  file = "rdata/coanc_sim_os.rda")

We can also use NORTA method to simulate genotypes. It takes a few hours to run the following codes, so we’re not running here.

X_sim_norta <- norta_approx(p_anc, coanc_antepops, Q_hat,
              geno_only = TRUE, parallel = TRUE, method = "numeric", tol = 1e-5, mc_cores = 20)

## estimate kinship
obj         <- popkin::popkin_A(X_sim_norta)
A_min   <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os_norta  <- 1 - obj$A / A_min

## mapping kinship coefficients to coancestry coefficients
coanc_sim_os_norta    <- popkin::inbr_diag(kinship_sim_os_norta)
coanc_sim_os_norta    <- ifelse(coanc_sim_os_norta < 0, 0, coanc_sim_os_norta)
save(coanc_sim_os_norta,   file = "rdata/coanc_sim_os_norta.rda")

We plot the individual-level coancestry.

load("rdata/coanc_sim_os.rda")
load("rdata/coanc_sim_os_norta.rda")

par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(
   c(14, 19, 15, 20, 16, 21,  0),
   c( 7,  1,  0,  2,  0,  3,  6),
   c( 0,  9,  0, 10,  0, 11,  0),
   c(17, 22, 18, 23,  0,  0,  0),
   c( 8,  4,  0,  5,  0,  0,  0),
   c( 0, 12,  0, 13,  0,  0,  0)),
  heights = c(0.2, 1, 0.22, 0.2, 1, 0.22),
  widths  = c(0.2, 1, 0.08, 1, 0.08, 1, 0.2))

coanc_sim_os       <- (coanc_sim_os - 1) * (1 - min(coanc_sup)) + 1
coanc_sim_os_norta <- (coanc_sim_os_norta - 1) * (1 - min(coanc_sup)) + 1
# We truncate the large entries of `coanc_std`
# coanc_std  <- ifelse(coanc_std > max(coanc_os), max(coanc_os), coanc_std)

popkin::plot_popkin(kinship = list(coanc_os, coanc_sup, coanc_std, coanc_sim_os, coanc_sim_os_norta),
                    layout_add = FALSE,
                    leg_cex = 0.8,
                    labs_text = FALSE,
                    labs_lwd = 0.1,
                    labs = fam$subpop,
                    ylab = '',
                    leg_title = "Coancestry",
                    panel_letters = NULL)

par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)

par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)

par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(A)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(B)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(C)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(D)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(E)", cex = 2, xpd = TRUE, font = 2)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof real data", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Super admixture individual coancestry\n of real data", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Standard admixture individual coancestry\n of real data", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (double-admixture)", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (NORTA)", cex = 1.6)

8 Confirming significant hypothesis tests of standard admixture versus super admixture

We perform the significance test of coancestry among antecedent populations to compare the model fit between standard admixture versus super admixture. The following chunk of codes is used to generate the null distribution of the test statistics \(U\). It should be noted that it takes days to run this chunk of code. We submit each iteration as a separate cluster job to speed up (not show here). Therefore, we’re just showing the code but not running it. We provide the pre-computed null test statistics in the file rdata/test_stats0.rda.

for (task_id in 1:1000) {
  ## compute null test statistics
  inbr_antepops <- diag(est_coanc(coanc_os, Q_hat, model = "standard"))

  ## compute null
  test_stats0 <- compute_null(p_anc, Q_hat, inbr_antepops, verbose = TRUE)

  # save results
  dir.create(file.path("rdata", "test_stat0"), showWarnings = FALSE)
  save(test_stat0, file = paste0("rdata/test_stat0/", task_id, ".rda"))
}

test_stats0 <- c()
for (task_id in 1:1000) {
  load(paste0("rdata/test_stat0/", task_id, ".rda"))
  test_stats0 <- c(test_stats0, test_stat0)
}
save(test_stats0, file = "rdata/test_stats0.rda")

We plot the distribution of null test statistics and label our observed test statistics.

load("rdata/test_stats0.rda")
load("rdata/coanc_os.rda")
load("rdata/Q_hat/7.rda")

coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")

test_stat1 <- norm(coanc_antepops_sup - coanc_antepops_std, "F")
p <- data.frame(test_stats0 = test_stats0) %>%
     ggplot(data, mapping = aes(x = test_stats0)) +
        geom_histogram(aes(y =..density..), 
                 fill  = "dodgerblue3", 
                 bins  = 10,
                 binwidth = NULL,
                 color = "black", 
                 alpha = 0.7, 
                 size  = 0.1) +
    labs(x = latex2exp::TeX(r"(Null test-statistics)"), y='Density') +
    scale_y_continuous(limits = c(0, 250)) +
    theme_bw(base_size = 16) +
    ggtitle("HGDP") +
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(), 
    plot.title = element_text(hjust = 0.5, size=16)) +
    annotate("text", x = 0.0465, y = 240, size = 6, label = latex2exp::TeX(sprintf("$U_{obs}$ = %.3f", test_stat1)))
p